Based on:
https://github.com/synthetichealth/synthea
https://github.com/Neo4jSolutions/patient-journey-model/tree/master/ingest
https://github.com/ccdgui/Patient_Flows_Sankey
import numpy as np
import re
import datetime
from py2neo import Graph, Node
import pandas as pd

import importlib
import secrets
# Open a Bolt connection to the Neo4j database queried by the rest of the notebook.
# Host, port and credentials come from the `secrets` module imported above.
# NOTE(review): presumably the project-local secrets.py (it shadows the
# standard-library `secrets` module) -- confirm. The connection is unencrypted
# (secure=False).
db = Graph(
    scheme="bolt",
    host=secrets.host,
    port=secrets.port,
    secure=False,
    auth=(secrets.user, secrets.password),
)
# Ten most frequent condition descriptions, ranked by how many nodes point at
# the condition through a HAS_CONDITION relationship.
query = """MATCH (c:Condition) <-[:HAS_CONDITION]-(e)
return c.description, count(e) as total
order by total desc
limit 10
"""
cursor = db.run(query)
df = cursor.to_data_frame()
df
| c.description | total | |
|---|---|---|
| 0 | Normal pregnancy | 3772 |
| 1 | Viral sinusitis (disorder) | 1161 |
| 2 | Preeclampsia | 1140 |
| 3 | Suspected COVID-19 | 968 |
| 4 | COVID-19 | 935 |
| 5 | Fever (finding) | 843 |
| 6 | Acute viral pharyngitis (disorder) | 699 |
| 7 | Cough (finding) | 629 |
| 8 | Acute bronchitis (disorder) | 546 |
| 9 | Loss of taste (finding) | 475 |
# Count the matches of "Suspected COVID-19" condition <- encounter - patient.
# NOTE(review): count(patient) counts one per matched path, not distinct
# patients; use count(DISTINCT patient) if a patient head-count is wanted.
query = """MATCH (c:Condition {description:"Suspected COVID-19"}) <-[:HAS_CONDITION]-(e)-[:HAS_ENCOUNTER]-(patient)
return count(patient)
"""
cursor = db.run(query)
df = cursor.to_data_frame()
df
| count(patient) | |
|---|---|
| 0 | 968 |
# For every encounter that carries a "Suspected COVID-19" condition, follow the
# NEXT chain to later encounters dated at most 60 days after it and return what
# those encounters are linked to (condition, drug, care plan, allergy or
# procedure), together with the optional HAS_END date, ordered by start date.
query = """MATCH (c:Condition {description:"Suspected COVID-19"}) <-[:HAS_CONDITION]-(e)
with e
MATCH (patient)-[:HAS_ENCOUNTER]-(e)-[:NEXT*]->(e2)-[:HAS_CONDITION|:HAS_DRUG|:HAS_CARE_PLAN|:HAS_ALLERGY|:HAS_PROCEDURE]->(x)
WHERE e2.date <= ( e.date + duration("P60D") )
OPTIONAL MATCH (e2)-[:HAS_END]->(end)
RETURN labels(x)[0] AS eventType, x.description AS name,
e2.date AS startDate,coalesce(end.date, "NA") AS endDate, id(patient) as patient, e2.isEnd as isEnd
ORDER BY startDate
"""
cursor = db.run(query)
df = cursor.to_data_frame()
len(df)
27723
# Keep one row per patient / event type / name / start timestamp.
# `patient` is part of the key: without it, two different patients who share
# the same event at the same timestamp would be collapsed into a single row and
# one of them would silently lose that step of the journey.
df = df.drop_duplicates(["patient", "eventType", "name", "startDate"])
# Peek at the first few patient ids.
df["patient"].unique()[:5]
array([ 9374, 12464, 12468, 1012, 9372])
# Inspect every row returned for one example patient.
df[df["patient"] == 32055]
| eventType | name | startDate | endDate | patient | isEnd | |
|---|---|---|---|---|---|---|
| 334 | CarePlan | Infectious disease care plan (record artifact) | 2020-02-16T21:22:59.000000000+00:00 | 2020-02-16T00:00:00.000000000+00:00 | 32055 | False |
| 338 | Procedure | Face mask (physical object) | 2020-02-16T21:22:59.000000000+00:00 | 2020-02-16T00:00:00.000000000+00:00 | 32055 | False |
| 342 | Condition | Headache (finding) | 2020-02-16T21:22:59.000000000+00:00 | 2020-02-16T00:00:00.000000000+00:00 | 32055 | False |
| 346 | Condition | COVID-19 | 2020-02-16T21:22:59.000000000+00:00 | 2020-02-16T00:00:00.000000000+00:00 | 32055 | False |
| 350 | Condition | Suspected COVID-19 | 2020-02-16T21:22:59.000000000+00:00 | 2020-02-16T00:00:00.000000000+00:00 | 32055 | False |
| 354 | Condition | Fatigue (finding) | 2020-02-16T21:22:59.000000000+00:00 | 2020-02-16T00:00:00.000000000+00:00 | 32055 | False |
| 358 | Condition | Cough (finding) | 2020-02-16T21:22:59.000000000+00:00 | 2020-02-16T00:00:00.000000000+00:00 | 32055 | False |
| 362 | Condition | Fever (finding) | 2020-02-16T21:22:59.000000000+00:00 | 2020-02-16T00:00:00.000000000+00:00 | 32055 | False |
| 15598 | Condition | Fever (finding) | 2020-03-13T00:00:00.000000000+00:00 | NA | 32055 | True |
| 15599 | Condition | Headache (finding) | 2020-03-13T00:00:00.000000000+00:00 | NA | 32055 | True |
| 15600 | Condition | Cough (finding) | 2020-03-13T00:00:00.000000000+00:00 | NA | 32055 | True |
| 15601 | Condition | Fatigue (finding) | 2020-03-13T00:00:00.000000000+00:00 | NA | 32055 | True |
| 15602 | Condition | COVID-19 | 2020-03-13T00:00:00.000000000+00:00 | NA | 32055 | True |
| 19790 | Drug | 72 HR Fentanyl 0.025 MG/HR Transdermal System | 2020-03-21T21:22:59.000000000+00:00 | NA | 32055 | True |
| 19791 | Drug | Acetaminophen 325 MG / Oxycodone Hydrochloride... | 2020-03-21T21:22:59.000000000+00:00 | NA | 32055 | True |
| 19792 | Drug | buprenorphine 2 MG / naloxone 0.5 MG Sublingua... | 2020-03-21T21:22:59.000000000+00:00 | NA | 32055 | True |
# Keep condition events only. Copy so the column assignments below write to an
# independent frame instead of a slice of the unfiltered one (chained assignment).
df = df[df["eventType"] == "Condition"].copy()

# Truncate every start timestamp to its calendar day.
df["startDate"] = df["startDate"].apply(
    lambda ts: pd.to_datetime(f"{ts.year}-{ts.month}-{ts.day}")
)

# Whole days elapsed since the same patient's previous row, in the current row
# order; NaN on each patient's first row.
df["delta"] = df.groupby("patient")["startDate"].diff().dt.days

# A gap of more than 60 days starts a new episode; `event` is the running
# episode number per patient (0.0, 1.0, ...), kept as float like the original.
starts_new_episode = (df["delta"] > 60).astype(float)
df["event"] = starts_new_episode.groupby(df["patient"]).cumsum()

# Inspect one example patient.
df.loc[df["patient"] == 34941]
| eventType | name | startDate | endDate | patient | isEnd | delta | event | |
|---|---|---|---|---|---|---|---|---|
| 1593 | Condition | Suspected COVID-19 | 2020-02-24 | 2020-02-24T00:00:00.000000000+00:00 | 34941 | False | NaN | 0.0 |
| 1597 | Condition | COVID-19 | 2020-02-24 | 2020-02-24T00:00:00.000000000+00:00 | 34941 | False | 0.0 | 0.0 |
| 1601 | Condition | Chill (finding) | 2020-02-24 | 2020-02-24T00:00:00.000000000+00:00 | 34941 | False | 0.0 | 0.0 |
| 1605 | Condition | Loss of taste (finding) | 2020-02-24 | 2020-02-24T00:00:00.000000000+00:00 | 34941 | False | 0.0 | 0.0 |
| 1609 | Condition | Fever (finding) | 2020-02-24 | 2020-02-24T00:00:00.000000000+00:00 | 34941 | False | 0.0 | 0.0 |
| 1613 | Condition | Sore throat symptom (finding) | 2020-02-24 | 2020-02-24T00:00:00.000000000+00:00 | 34941 | False | 0.0 | 0.0 |
| 1617 | Condition | Fatigue (finding) | 2020-02-24 | 2020-02-24T00:00:00.000000000+00:00 | 34941 | False | 0.0 | 0.0 |
| 19815 | Condition | Chill (finding) | 2020-03-22 | NA | 34941 | True | 27.0 | 0.0 |
| 19816 | Condition | COVID-19 | 2020-03-22 | NA | 34941 | True | 0.0 | 0.0 |
| 19817 | Condition | Sore throat symptom (finding) | 2020-03-22 | NA | 34941 | True | 0.0 | 0.0 |
| 19818 | Condition | Fever (finding) | 2020-03-22 | NA | 34941 | True | 0.0 | 0.0 |
| 19819 | Condition | Loss of taste (finding) | 2020-03-22 | NA | 34941 | True | 0.0 | 0.0 |
| 19820 | Condition | Fatigue (finding) | 2020-03-22 | NA | 34941 | True | 0.0 | 0.0 |
# Condition names excluded from the flow analysis.
remove = [
    'Anemia (disorder)',
    'Normal pregnancy',
    'Preeclampsia',
    'Otitis media',
    'Fracture of ankle',
    'Laceration of hand',
    'Antepartum eclampsia',
    'Sprain of wrist',
    'Concussion with no loss of consciousness',
    'Laceration of thigh',
    'Hypertriglyceridemia (disorder)',
    'Fetus with unknown complication',
    'Miscarriage in first trimester',
]
# Keep only the rows whose condition name is not in the exclusion list.
excluded = df["name"].isin(remove)
df = df[~excluded]
# Condition names that remain.
df["name"].unique()
array(['Loss of taste (finding)', 'COVID-19', 'Fever (finding)',
'Suspected COVID-19', 'Cough (finding)',
'Sore throat symptom (finding)', 'Joint pain (finding)',
'Muscle pain (finding)', 'Fatigue (finding)',
'Diarrhea symptom (finding)', 'Headache (finding)',
'Hypoxemia (disorder)', 'Acute respiratory failure (disorder)',
'Pneumonia (disorder)', 'Sepsis caused by virus (disorder)',
'Respiratory distress (finding)', 'Sputum finding (finding)',
'Dyspnea (finding)', 'Wheezing (finding)',
'Acute pulmonary embolism (disorder)',
'Acute deep venous thrombosis (disorder)',
'Nasal congestion (finding)', 'Chill (finding)',
'Nausea (finding)', 'Vomiting symptom (finding)',
'Viral sinusitis (disorder)', 'Hemoptysis (finding)',
'Acute viral pharyngitis (disorder)',
'Acquired coagulation disorder (disorder)',
'Acute respiratory distress syndrome (disorder)',
'Injury of kidney (disorder)', 'Septic shock (disorder)',
'Injury of heart (disorder)', 'Contact dermatitis',
'Body mass index 30+ - obesity (finding)',
'Coronary Heart Disease', 'Diabetes',
'Streptococcal sore throat (disorder)',
'Acute bronchitis (disorder)',
'Escherichia coli urinary tract infection'], dtype=object)
# Distinct episode numbers that remain after filtering.
df["event"].unique()
array([0.])
# Inspect every remaining row for one example patient.
df[df["patient"] == 32060]
| eventType | name | startDate | endDate | patient | isEnd | delta | event | |
|---|---|---|---|---|---|---|---|---|
| 1240 | Condition | Suspected COVID-19 | 2020-02-22 | 2020-02-22T00:00:00.000000000+00:00 | 32060 | False | NaN | 0.0 |
| 1244 | Condition | Loss of taste (finding) | 2020-02-22 | 2020-02-22T00:00:00.000000000+00:00 | 32060 | False | 0.0 | 0.0 |
| 1248 | Condition | Fever (finding) | 2020-02-22 | 2020-02-22T00:00:00.000000000+00:00 | 32060 | False | 0.0 | 0.0 |
| 1252 | Condition | COVID-19 | 2020-02-22 | 2020-02-22T00:00:00.000000000+00:00 | 32060 | False | 0.0 | 0.0 |
| 1256 | Condition | Headache (finding) | 2020-02-22 | 2020-02-22T00:00:00.000000000+00:00 | 32060 | False | 0.0 | 0.0 |
| 1260 | Condition | Fatigue (finding) | 2020-02-22 | 2020-02-22T00:00:00.000000000+00:00 | 32060 | False | 0.0 | 0.0 |
| 1264 | Condition | Wheezing (finding) | 2020-02-22 | 2020-02-22T00:00:00.000000000+00:00 | 32060 | False | 0.0 | 0.0 |
| 1268 | Condition | Cough (finding) | 2020-02-22 | 2020-02-22T00:00:00.000000000+00:00 | 32060 | False | 0.0 | 0.0 |
| 1272 | Condition | Dyspnea (finding) | 2020-02-22 | 2020-02-22T00:00:00.000000000+00:00 | 32060 | False | 0.0 | 0.0 |
| 14720 | Condition | Fever (finding) | 2020-03-12 | NA | 32060 | True | 19.0 | 0.0 |
| 14721 | Condition | COVID-19 | 2020-03-12 | NA | 32060 | True | 0.0 | 0.0 |
| 14722 | Condition | Cough (finding) | 2020-03-12 | NA | 32060 | True | 0.0 | 0.0 |
| 14723 | Condition | Loss of taste (finding) | 2020-03-12 | NA | 32060 | True | 0.0 | 0.0 |
| 14724 | Condition | Dyspnea (finding) | 2020-03-12 | NA | 32060 | True | 0.0 | 0.0 |
| 14725 | Condition | Fatigue (finding) | 2020-03-12 | NA | 32060 | True | 0.0 | 0.0 |
| 14726 | Condition | Headache (finding) | 2020-03-12 | NA | 32060 | True | 0.0 | 0.0 |
| 14727 | Condition | Wheezing (finding) | 2020-03-12 | NA | 32060 | True | 0.0 | 0.0 |
df = df.sort_values(["patient", "startDate", "event"])

# `idx` is the 0-based rank of the start date inside each (patient, event)
# episode: rows sharing a date share a step.
# ngroup() numbers the (patient, event, startDate) groups in sorted key order,
# so within one episode the numbers are consecutive and grow with the date.
# Subtracting the episode's smallest number gives the dense rank in one pass,
# instead of re-masking the whole frame for every patient and event.
date_group = df.groupby(["patient", "event", "startDate"]).ngroup()
episode_first_group = date_group.groupby([df["patient"], df["event"]]).transform("min")
df["idx"] = (date_group - episode_first_group).astype(float)

# Sanity check: no row should be left without a step index.
df[df["idx"].isna()]
| eventType | name | startDate | endDate | patient | isEnd | delta | event | idx |
|---|
# Inspect the step indices assigned to one example patient.
df[df["patient"] == 34119]
| eventType | name | startDate | endDate | patient | isEnd | delta | event | idx | |
|---|---|---|---|---|---|---|---|---|---|
| 1002 | Condition | Sputum finding (finding) | 2020-02-20 | 2020-03-07T00:00:00.000000000+00:00 | 34119 | False | NaN | 0.0 | 0.0 |
| 1006 | Condition | Fever (finding) | 2020-02-20 | 2020-03-07T00:00:00.000000000+00:00 | 34119 | False | 0.0 | 0.0 | 0.0 |
| 1010 | Condition | Suspected COVID-19 | 2020-02-20 | 2020-03-07T00:00:00.000000000+00:00 | 34119 | False | 0.0 | 0.0 | 0.0 |
| 1014 | Condition | COVID-19 | 2020-02-20 | 2020-03-07T00:00:00.000000000+00:00 | 34119 | False | 0.0 | 0.0 | 0.0 |
| 1018 | Condition | Fatigue (finding) | 2020-02-20 | 2020-03-07T00:00:00.000000000+00:00 | 34119 | False | 0.0 | 0.0 | 0.0 |
| 1022 | Condition | Sore throat symptom (finding) | 2020-02-20 | 2020-03-07T00:00:00.000000000+00:00 | 34119 | False | 0.0 | 0.0 | 0.0 |
| 8155 | Condition | COVID-19 | 2020-03-07 | NA | 34119 | True | 16.0 | 0.0 | 1.0 |
| 8156 | Condition | Fever (finding) | 2020-03-07 | NA | 34119 | True | 0.0 | 0.0 | 1.0 |
| 8157 | Condition | Fatigue (finding) | 2020-03-07 | NA | 34119 | True | 0.0 | 0.0 | 1.0 |
| 8158 | Condition | Sore throat symptom (finding) | 2020-03-07 | NA | 34119 | True | 0.0 | 0.0 | 1.0 |
| 8159 | Condition | Sputum finding (finding) | 2020-03-07 | NA | 34119 | True | 0.0 | 0.0 | 1.0 |
# If a step (same patient, event and idx) contains both "Suspected COVID-19"
# and "COVID-19", drop the "Suspected COVID-19" rows of that step.
#
# The rows are removed with a boolean mask. Unlike blanking them to NaN and
# filtering afterwards, this keeps the column dtypes intact (e.g. `patient`
# stays an integer), and the test is "a confirmed row is present" rather than
# "exactly two matching rows", so repeated rows in a step cannot defeat it.

# Rows without a step index cannot belong to any step; drop them first.
df = df[df["idx"].notna()]

step_keys = [df["patient"], df["event"], df["idx"]]
is_suspected = df["name"] == "Suspected COVID-19"
# Number of confirmed "COVID-19" rows in each row's own step.
confirmed_in_step = (
    (df["name"] == "COVID-19").astype(int).groupby(step_keys).transform("sum")
)
df = df[~(is_suspected & (confirmed_in_step > 0))]

# Inspect one example patient.
df.loc[df["patient"] == 34947]
| eventType | name | startDate | endDate | patient | isEnd | delta | event | idx | |
|---|---|---|---|---|---|---|---|---|---|
| 5802 | Condition | COVID-19 | 2020-03-04 | 2020-03-04T00:00:00.000000000+00:00 | 34947.0 | False | NaN | 0.0 | 0.0 |
| 5806 | Condition | Sputum finding (finding) | 2020-03-04 | 2020-03-04T00:00:00.000000000+00:00 | 34947.0 | False | 0.0 | 0.0 | 0.0 |
| 5814 | Condition | Fever (finding) | 2020-03-04 | 2020-03-04T00:00:00.000000000+00:00 | 34947.0 | False | 0.0 | 0.0 | 0.0 |
| 18379 | Condition | Fever (finding) | 2020-03-19 | NA | 34947.0 | True | 15.0 | 0.0 | 1.0 |
| 18380 | Condition | COVID-19 | 2020-03-19 | NA | 34947.0 | True | 0.0 | 0.0 | 1.0 |
| 18381 | Condition | Sputum finding (finding) | 2020-03-19 | NA | 34947.0 | True | 0.0 | 0.0 | 1.0 |
# Shift every existing step up by one so that step 0 is free for the synthetic
# starting node added below.
df["idx"] = df["idx"] + 1

# Give every (patient, event) episode a "Suspected COVID-19" row at step 0,
# dated 5 days before the episode's earliest start date, so all journeys share
# a common origin.
episode_start = df.groupby(["patient", "event"], sort=False)["startDate"].min()
synthetic_rows = pd.DataFrame(
    [
        {
            "eventType": "Condition",
            "name": "Suspected COVID-19",
            "startDate": first_date - pd.Timedelta(days=5),
            "endDate": np.nan,
            "patient": patient_id,
            # Real boolean, matching the rest of the column (was the string "False").
            "isEnd": False,
            "delta": 0,
            "event": episode,
            "idx": 0,
        }
        for (patient_id, episode), first_date in episode_start.items()
    ]
)
# All rows are added with a single concat: DataFrame.append no longer exists in
# pandas >= 2.0 and copied the whole frame on every call.
if not synthetic_rows.empty:
    df = pd.concat([df, synthetic_rows], ignore_index=True)

df = df.sort_values(["patient", "startDate", "event", "idx"])
df.patient.nunique()
484
import itertools
from collections import defaultdict

# For every (patient, event) journey, collect all transitions between
# consecutive steps: each condition at step i paired with each condition at
# step i+1. Labels have the form "<condition name>_<step>".
mydict = defaultdict(list)
for (p, e), journey in df.groupby(["patient", "event"], sort=False):
    pid = str(p) + "_" + str(e)
    transitions = mydict[pid]  # creates the entry even when there is no transition

    # Labels present at each step of this journey.
    labels_by_step = {
        int(i): (names + "_" + str(int(i))).tolist()
        for i, names in journey.groupby("idx")["name"]
    }
    # Pair each step with its direct successor. Steps are visited in sorted
    # order, so the result does not depend on the row order of df.
    for step in sorted(labels_by_step):
        following = labels_by_step.get(step + 1)
        if following:
            transitions.extend(itertools.product(labels_by_step[step], following))
len(mydict)
484
# One list of transitions per journey; the dictionary keys are not needed below.
output_values = [*mydict.values()]
len(output_values)
484
from collections import Counter

# How many journeys contain each transition. set() makes a transition count at
# most once per journey.
frequency = dict(Counter(pair for pairs in output_values for pair in set(pairs)))

# Links are created first, from the items of `frequency`; nodes are derived
# from each link's source and target.
sankey = {"links": [], "nodes": []}
known_node_names = set()  # constant-time "already added?" check
for (source_label, target_label), count in frequency.items():
    link = dict(
        source=str(source_label),
        target=str(target_label),
        value=count,
    )
    sankey["links"].append(link)
    for label in (link["source"], link["target"]):
        if label in known_node_names:
            continue
        known_node_names.add(label)
        sankey["nodes"].append(
            dict(
                name=label,
                station=re.sub('[^a-zA-Z]+', '', label),
                # The step is the number after the LAST underscore. Collecting
                # every digit in the label would turn "COVID-19_1" into 191.
                step=int(label.rpartition("_")[2]),
            )
        )

# Order nodes by numeric step (stable, so insertion order breaks ties) and
# number them in that order.
sorted_nodes = sorted(sankey["nodes"], key=lambda node: node["step"])
for position, node in enumerate(sorted_nodes):
    node["id"] = position
    node["color"] = 'rgba(31, 119, 180, 0.8)'

# Number of distinct stations (condition names with the step stripped).
l = len({node["station"] for node in sorted_nodes})
l
40
# Colour palette for the Sankey nodes: five rows of nine RGB strings (45 total).
# Note that 'rgb(247,247,247)' appears in both the second and the fourth row.
cols = [
    'rgb(215,48,39)', 'rgb(244,109,67)', 'rgb(253,174,97)', 'rgb(254,224,144)', 'rgb(255,255,191)', 'rgb(224,243,248)', 'rgb(171,217,233)', 'rgb(116,173,209)', 'rgb(69,117,180)',
    'rgb(197,27,125)', 'rgb(222,119,174)', 'rgb(241,182,218)', 'rgb(253,224,239)', 'rgb(247,247,247)', 'rgb(230,245,208)', 'rgb(184,225,134)', 'rgb(127,188,65)', 'rgb(77,146,33)',
    'rgb(255,247,236)', 'rgb(254,232,200)', 'rgb(253,212,158)', 'rgb(253,187,132)', 'rgb(252,141,89)', 'rgb(239,101,72)', 'rgb(215,48,31)', 'rgb(179,0,0)', 'rgb(127,0,0)',
    'rgb(178,24,43)', 'rgb(214,96,77)', 'rgb(244,165,130)', 'rgb(253,219,199)', 'rgb(247,247,247)', 'rgb(209,229,240)', 'rgb(146,197,222)', 'rgb(67,147,195)', 'rgb(33,102,172)',
    'rgb(140,81,10)', 'rgb(191,129,45)', 'rgb(223,194,125)', 'rgb(246,232,195)', 'rgb(245,245,245)', 'rgb(199,234,229)', 'rgb(128,205,193)', 'rgb(53,151,143)', 'rgb(1,102,94)',
]
len(cols)
45
from collections import ChainMap

# Assign one palette colour to every station.
# Stations are sorted so the assignment is the same on every run (iterating a
# set of strings is not), and the palette is cycled so every station still gets
# a colour when there are more stations than palette entries.
stations = sorted({node['station'] for node in sorted_nodes})
color_dict = [
    {station: cols[position % len(cols)]}
    for position, station in enumerate(stations)
]
# Flatten the single-entry dicts into one station -> colour mapping.
data = dict(ChainMap(*color_dict))
def id_lookup(node, sorted_list):
    """Return the ``id`` of the first entry whose ``name`` equals ``node['source']``.

    ``sorted_list`` is scanned in order; ``None`` is returned when no entry
    matches.
    """
    matching_ids = (
        entry['id'] for entry in sorted_list if entry['name'] == node['source']
    )
    return next(matching_ids, None)
# Tag every link with the id of its source node and order the links by it.
for d in sankey['links']:
    d['source_id'] = id_lookup(d, sorted_nodes)
sorted_links = sorted(sankey['links'], key=lambda k: k['source_id'])

# Node arrays in plot order. `data` still holds the station -> colour mapping
# at this point; it is replaced by the final plot structure at the end.
labels = [node['name'] for node in sorted_nodes]
nodes = dict(
    label=labels,
    color=[data[node['station']] for node in sorted_nodes],
)

# Position of each label in the node array (first occurrence, like list.index),
# built once so each link endpoint is resolved without rescanning the labels.
label_position = {}
for position, label in enumerate(labels):
    label_position.setdefault(label, position)

link = dict(
    source=[label_position[item['source']] for item in sorted_links],
    target=[label_position[item['target']] for item in sorted_links],
    value=[item['value'] for item in sorted_links],
)
data = dict(nodes=nodes, link=link)
import plotly as py
from plotly.offline import iplot

# Sankey link labels are one per link. Build them from the endpoints of each
# link instead of reusing the (differently sized) node label list.
node_labels = data["nodes"]["label"]
link_labels = [
    f"{node_labels[source]} → {node_labels[target]}"
    for source, target in zip(data["link"]["source"], data["link"]["target"])
]

# Trace definition for the interactive Sankey diagram.
data_trace = dict(
    type='sankey',
    domain=dict(
        x=[0, 1],
        y=[0, 1],
    ),
    orientation="h",
    valueformat=".0f",
    valuesuffix=" Patients",  # leading space so values read "12 Patients"
    node=dict(
        pad=5,
        thickness=10,
        line=dict(
            color="black",
            width=0.5,
        ),
        label=node_labels,
        color=data["nodes"]["color"],
    ),
    link=dict(
        source=data["link"]["source"],
        target=data["link"]["target"],
        value=data["link"]["value"],
        label=link_labels,
    ),
)

layout = dict(
    title="Patient Flow Analysis - 60 Days of Conditions after Suspected Covid-19 Observation",
    width=1000,
    height=1000,
    font=dict(size=10),
)

fig = dict(data=[data_trace], layout=layout)
py.offline.iplot(fig, validate=False)
## Add a static PNG: the interactive version doesn't show up on GitHub.
!ls
cfg.py preeclampsia-sankey.py config.yml __pycache__ covid19-sankey.ipynb pyingest covid-19-sankey.png README.md data sankey.png one-patient-journey.png schema.png patient-journey-2.png secrets.py preeclampsia-sankey.csv synthea preeclampsia-sankey.html synthea-preeclampsia-filter.ipynb preeclampsia-sankey.ipynb

import plotly.graph_objects as go

# Sankey link labels are one per link. Build them from the endpoints of each
# link instead of reusing the (differently sized) node label list.
node_labels = data["nodes"]["label"]
link_labels = [
    f"{node_labels[source]} → {node_labels[target]}"
    for source, target in zip(data["link"]["source"], data["link"]["target"])
]

# Same diagram as a plotly graph_objects figure.
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=node_labels,
        color=data["nodes"]["color"],
    ),
    link=dict(
        source=data["link"]["source"],
        target=data["link"]["target"],
        value=data["link"]["value"],
        label=link_labels,
    ),
)])
fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()